Libraries and Imports¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import pearsonr
import warnings
In [2]:
!pip install pyarrow
Requirement already satisfied: pyarrow in ./.env/lib/python3.11/site-packages (21.0.0)
In [3]:
import boto3
from io import StringIO
from pyspark.sql import SparkSession
# NOTE(review): star imports pollute the namespace and can shadow builtins
# (e.g. sum/min/max). Only avg() and desc() are used later in this notebook —
# consider importing them explicitly.
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Silence all warnings notebook-wide (broad: also hides potentially useful ones).
warnings.filterwarnings('ignore')

Spark Configuration¶

Inicializar sesión de Spark¶

# NOTE(review): this session is superseded by the fully configured builder in
# the next cell. Because getOrCreate() returns any already-running session,
# the S3A .config(...) calls there may silently NOT take effect if this cell
# ran first — confirm, or drop one of the two cells.
spark = SparkSession.builder.appName("VideoGameAnalysis").master("local[*]").getOrCreate()

In [5]:
# (duplicate `from pyspark.sql import SparkSession` removed — already imported above)
spark = (
    SparkSession.builder
    .appName("Video_Games_R")
    # Hadoop S3A connector + AWS SDK bundle so Spark can read s3a:// paths.
    .config("spark.jars.packages",
            "org.apache.hadoop:hadoop-aws:3.3.2,com.amazonaws:aws-java-sdk-bundle:1.12.367")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Credentials resolved from env vars / ~/.aws / instance profile.
    .config("spark.hadoop.fs.s3a.aws.credentials.provider",
            "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .getOrCreate()
)
# Where to look for the parquet file. NOTE: this file is produced by the export
# step further below — this read only works if a previous run already wrote it.
# Named df_parquet (not df) so it is not confused with the pandas `df` loaded later.
df_parquet = spark.read.parquet("s3a://xideralaws-curso-uriel/parquet/video_game_reviews_cleaned.parquet")
df_parquet.printSchema()
df_parquet.show(3, truncate=False)
root
 |-- Game Title: string (nullable = true)
 |-- User Rating: double (nullable = true)
 |-- Age Group Targeted: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Developer: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Release Year: long (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Multiplayer: string (nullable = true)
 |-- Game Length (Hours): double (nullable = true)
 |-- Graphics Quality: string (nullable = true)
 |-- Soundtrack Quality: string (nullable = true)
 |-- Story Quality: string (nullable = true)
 |-- User Review Text: string (nullable = true)
 |-- Game Mode: string (nullable = true)

+------------------+-----------+------------------+-----+--------+----------+---------------+------------+---------+-----------+-------------------+----------------+------------------+-------------+---------------------------------------------+---------+
|Game Title        |User Rating|Age Group Targeted|Price|Platform|Developer |Publisher      |Release Year|Genre    |Multiplayer|Game Length (Hours)|Graphics Quality|Soundtrack Quality|Story Quality|User Review Text                             |Game Mode|
+------------------+-----------+------------------+-----+--------+----------+---------------+------------+---------+-----------+-------------------+----------------+------------------+-------------+---------------------------------------------+---------+
|Grand Theft Auto V|36.4       |All Ages          |41.41|PC      |Game Freak|Innersloth     |2015        |Adventure|No         |55.3               |Medium          |Average           |Poor         |Solid game, but too many bugs.               |Offline  |
|The Sims 4        |38.3       |Adults            |57.56|PC      |Nintendo  |Electronic Arts|2015        |Shooter  |Yes        |34.6               |Low             |Poor              |Poor         |Solid game, but too many bugs.               |Offline  |
|Minecraft         |26.8       |Teens             |44.93|PC      |Bungie    |Capcom         |2012        |Adventure|Yes        |13.9               |Low             |Good              |Average      |Great game, but the graphics could be better.|Offline  |
+------------------+-----------+------------------+-----+--------+----------+---------------+------------+---------+-----------+-------------------+----------------+------------------+-------------+---------------------------------------------+---------+
only showing top 3 rows

AWS S3 Configuration¶

In [6]:
# S3 client (credentials come from the default boto3 provider chain:
# env vars, ~/.aws config, or an instance profile — never hardcoded here).
s3 = boto3.client("s3")

# S3 configuration: source bucket and CSV object key (reused by later cells).
bucket_name = "xideralaws-curso-uriel"
object_key = "video_game_reviews.csv"

Data Loading from S3¶

In [7]:
print("=== CARGANDO DATOS DESDE AWS S3 ===")

# Fetch the CSV object and decode its body to text.
s3_object = s3.get_object(Bucket=bucket_name, Key=object_key)
csv_text = s3_object["Body"].read().decode("utf-8")

# Parse the in-memory CSV into a pandas DataFrame (used by the rest of the notebook).
df = pd.read_csv(StringIO(csv_text))

print(f"✅ Datos cargados exitosamente desde S3")
print(f"Bucket: {bucket_name}")
print(f"Archivo: {object_key}")
=== CARGANDO DATOS DESDE AWS S3 ===
✅ Datos cargados exitosamente desde S3
Bucket: xideralaws-curso-uriel
Archivo: video_game_reviews.csv

General Information about the Dataset¶

In [8]:
# Quick structural overview of the raw dataset: shape, columns, dtypes, nulls.
print("\n=== INFORMACIÓN GENERAL DEL DATASET ===")
print(f"Forma del dataset: {df.shape}")
print(f"\nColumnas: {list(df.columns)}")
# These two headers had a pointless f-prefix (no placeholders) — plain strings now.
print("\nTipos de datos:")
print(df.dtypes)
print("\nValores faltantes:")
print(df.isnull().sum())

# Descriptive statistics (numeric columns only, pandas default).
print("\n=== ESTADÍSTICAS DESCRIPTIVAS ===")
print(df.describe())
=== INFORMACIÓN GENERAL DEL DATASET ===
Forma del dataset: (47774, 18)

Columnas: ['Game Title', 'User Rating', 'Age Group Targeted', 'Price', 'Platform', 'Requires Special Device', 'Developer', 'Publisher', 'Release Year', 'Genre', 'Multiplayer', 'Game Length (Hours)', 'Graphics Quality', 'Soundtrack Quality', 'Story Quality', 'User Review Text', 'Game Mode', 'Min Number of Players']

Tipos de datos:
Game Title                  object
User Rating                float64
Age Group Targeted          object
Price                      float64
Platform                    object
Requires Special Device     object
Developer                   object
Publisher                   object
Release Year                 int64
Genre                       object
Multiplayer                 object
Game Length (Hours)        float64
Graphics Quality            object
Soundtrack Quality          object
Story Quality               object
User Review Text            object
Game Mode                   object
Min Number of Players        int64
dtype: object

Valores faltantes:
Game Title                 0
User Rating                0
Age Group Targeted         0
Price                      0
Platform                   0
Requires Special Device    0
Developer                  0
Publisher                  0
Release Year               0
Genre                      0
Multiplayer                0
Game Length (Hours)        0
Graphics Quality           0
Soundtrack Quality         0
Story Quality              0
User Review Text           0
Game Mode                  0
Min Number of Players      0
dtype: int64

=== ESTADÍSTICAS DESCRIPTIVAS ===
        User Rating         Price  Release Year  Game Length (Hours)  \
count  47774.000000  47774.000000  47774.000000         47774.000000   
mean      29.719329     39.951371   2016.480952            32.481672   
std        7.550131     11.520342      4.027276            15.872508   
min       10.100000     19.990000   2010.000000             5.000000   
25%       24.300000     29.990000   2013.000000            18.800000   
50%       29.700000     39.845000   2016.000000            32.500000   
75%       35.100000     49.957500   2020.000000            46.300000   
max       49.500000     59.990000   2023.000000            60.000000   

       Min Number of Players  
count           47774.000000  
mean                5.116758  
std                 2.769521  
min                 1.000000  
25%                 3.000000  
50%                 5.000000  
75%                 7.000000  
max                10.000000  

Conversion to Spark DataFrame¶

In [9]:
print("\n=== CREANDO SPARK DATAFRAME ===")

# Build a Spark DataFrame from the pandas one (schema inferred). The whole
# dataset is serialized into the tasks, which is what triggers the
# "task of very large size" warnings visible in the output below.
df_spark = spark.createDataFrame(df)

# Show Spark-side schema and a sample to confirm the conversion.
print("Schema del DataFrame de Spark:")
df_spark.printSchema()

print(f"Número de filas en Spark DataFrame: {df_spark.count()}")
print("Primeras 5 filas:")
df_spark.show(5)
=== CREANDO SPARK DATAFRAME ===
Schema del DataFrame de Spark:
root
 |-- Game Title: string (nullable = true)
 |-- User Rating: double (nullable = true)
 |-- Age Group Targeted: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Requires Special Device: string (nullable = true)
 |-- Developer: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Release Year: long (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Multiplayer: string (nullable = true)
 |-- Game Length (Hours): double (nullable = true)
 |-- Graphics Quality: string (nullable = true)
 |-- Soundtrack Quality: string (nullable = true)
 |-- Story Quality: string (nullable = true)
 |-- User Review Text: string (nullable = true)
 |-- Game Mode: string (nullable = true)
 |-- Min Number of Players: long (nullable = true)

25/08/29 10:10:10 WARN TaskSetManager: Stage 4 contains a task of very large size (1451 KiB). The maximum recommended task size is 1000 KiB.
                                                                                
Número de filas en Spark DataFrame: 47774
Primeras 5 filas:
25/08/29 10:10:13 WARN TaskSetManager: Stage 7 contains a task of very large size (1451 KiB). The maximum recommended task size is 1000 KiB.
+------------------+-----------+------------------+-----+-----------+-----------------------+----------+---------------+------------+---------+-----------+-------------------+----------------+------------------+-------------+--------------------+---------+---------------------+
|        Game Title|User Rating|Age Group Targeted|Price|   Platform|Requires Special Device| Developer|      Publisher|Release Year|    Genre|Multiplayer|Game Length (Hours)|Graphics Quality|Soundtrack Quality|Story Quality|    User Review Text|Game Mode|Min Number of Players|
+------------------+-----------+------------------+-----+-----------+-----------------------+----------+---------------+------------+---------+-----------+-------------------+----------------+------------------+-------------+--------------------+---------+---------------------+
|Grand Theft Auto V|       36.4|          All Ages|41.41|         PC|                     No|Game Freak|     Innersloth|        2015|Adventure|         No|               55.3|          Medium|           Average|         Poor|Solid game, but t...|  Offline|                    1|
|        The Sims 4|       38.3|            Adults|57.56|         PC|                     No|  Nintendo|Electronic Arts|        2015|  Shooter|        Yes|               34.6|             Low|              Poor|         Poor|Solid game, but t...|  Offline|                    3|
|         Minecraft|       26.8|             Teens|44.93|         PC|                    Yes|    Bungie|         Capcom|        2012|Adventure|        Yes|               13.9|             Low|              Good|      Average|Great game, but t...|  Offline|                    5|
| Bioshock Infinite|       38.4|          All Ages|48.29|     Mobile|                    Yes|Game Freak|       Nintendo|        2015|   Sports|         No|               41.9|          Medium|              Good|    Excellent|Solid game, but t...|   Online|                    4|
|   Half-Life: Alyx|       30.1|            Adults|55.49|PlayStation|                    Yes|Game Freak|     Epic Games|        2022|      RPG|        Yes|               13.2|            High|              Poor|         Good|Great game, but t...|  Offline|                    1|
+------------------+-----------+------------------+-----+-----------+-----------------------+----------+---------------+------------+---------+-----------+-------------------+----------------+------------------+-------------+--------------------+---------+---------------------+
only showing top 5 rows

Analysis with Spark¶

In [10]:
print("\n=== ANÁLISIS CON SPARK ===")

# Column listing for reference.
print("Columnas del DataFrame de Spark:")
print(df_spark.columns)

# Average rating per genre (avg/desc come from the star import of
# pyspark.sql.functions at the top of the notebook).
print("\nRating promedio por género (usando Spark):")
df_spark.groupBy("Genre").agg(avg("User Rating").alias("avg_rating")).orderBy(desc("avg_rating")).show()

# Top 10 most-reviewed games: row count per title, descending.
print("\nTop 10 juegos más reseñados (usando Spark):")
df_spark.groupBy("Game Title").count().orderBy(desc("count")).show(10)
=== ANÁLISIS CON SPARK ===
Columnas del DataFrame de Spark:
['Game Title', 'User Rating', 'Age Group Targeted', 'Price', 'Platform', 'Requires Special Device', 'Developer', 'Publisher', 'Release Year', 'Genre', 'Multiplayer', 'Game Length (Hours)', 'Graphics Quality', 'Soundtrack Quality', 'Story Quality', 'User Review Text', 'Game Mode', 'Min Number of Players']

Rating promedio por género (usando Spark):
25/08/29 10:10:16 WARN TaskSetManager: Stage 8 contains a task of very large size (1451 KiB). The maximum recommended task size is 1000 KiB.
                                                                                
+----------+------------------+
|     Genre|        avg_rating|
+----------+------------------+
|     Party|29.791343723673183|
|  Fighting|29.786238532110104|
|       RPG|29.766858198235212|
|    Puzzle| 29.75960182496892|
|  Strategy|29.716170125333914|
| Adventure|29.700736842105286|
|    Sports| 29.69997888067586|
|    Action|29.691722353955612|
|Simulation|29.650710702341154|
|   Shooter|29.631074142534402|
+----------+------------------+


Top 10 juegos más reseñados (usando Spark):
25/08/29 10:10:18 WARN TaskSetManager: Stage 11 contains a task of very large size (1451 KiB). The maximum recommended task size is 1000 KiB.
[Stage 11:=============================>                            (1 + 1) / 2]
+--------------------+-----+
|          Game Title|count|
+--------------------+-----+
|Pokémon Scarlet &...| 1274|
|           Minecraft| 1265|
|             FIFA 24| 1247|
|    Street Fighter V| 1245|
|  Tomb Raider (2013)| 1245|
|Super Smash Bros....| 1235|
|           Fall Guys| 1232|
|            Hitman 3| 1229|
|              Tetris| 1224|
|   1000-Piece Puzzle| 1222|
+--------------------+-----+
only showing top 10 rows

                                                                                

Transforming and Cleaning Data¶

In [11]:
print("\n=== LIMPIEZA Y TRANSFORMACIÓN DE DATOS ===")

# Columns required downstream; 'Requires Special Device' and
# 'Min Number of Players' are intentionally dropped.
essential_columns = [
    'Game Title', 'User Rating', 'Age Group Targeted', 'Price', 'Platform',
    'Developer', 'Publisher', 'Release Year', 'Genre', 'Multiplayer',
    'Game Length (Hours)', 'Graphics Quality', 'Soundtrack Quality',
    'Story Quality', 'User Review Text', 'Game Mode'
]

print(f"Columnas originales: {len(df.columns)}")
print(f"Columnas esenciales: {len(essential_columns)}")

# Fail fast with a clear message if the source schema changed upstream.
missing = set(essential_columns) - set(df.columns)
assert not missing, f"Columnas faltantes en el dataset: {missing}"

# Keep only the essential columns; .copy() so df_cleaned is independent of df.
df_cleaned = df[essential_columns].copy()
df_spark_cleaned = spark.createDataFrame(df_cleaned)

print(f"Datos después de limpieza: {df_cleaned.shape}")
=== LIMPIEZA Y TRANSFORMACIÓN DE DATOS ===
Columnas originales: 18
Columnas esenciales: 16
Datos después de limpieza: (47774, 16)

Style Configuration for Visualizations¶

In [12]:
# Global plotting style for all charts below.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

Visualizations (14 Charts)¶

1. Temporal Trend (Games per Year)¶

In [20]:
# Games released per year. The original drew into slot 8 of a vestigial 5x3
# grid on a 20x25-inch canvas (leftover from a combined dashboard figure);
# a single explicit axes is used instead.
fig, ax = plt.subplots(figsize=(10, 6))
year_counts = df_cleaned['Release Year'].value_counts().sort_index()
ax.plot(year_counts.index, year_counts.values, marker='o', linewidth=2, markersize=4)
ax.set_title('Games Released per Year', fontsize=14, fontweight='bold')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Games')
ax.grid(True, alpha=0.3)
plt.show()
No description has been provided for this image

2. Rating vs Precio (scatter plot)¶

In [18]:
# Rating vs. price scatter (original used a vestigial 5x3 subplot grid on a
# 20x25-inch canvas; replaced with a single explicit axes).
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df_cleaned['Price'], df_cleaned['User Rating'], alpha=0.5, color='purple')
ax.set_title('Rating vs. Price Relationship', fontsize=14, fontweight='bold')
ax.set_xlabel('Price ($)')
ax.set_ylabel('User Rating')
ax.grid(True, alpha=0.3)
plt.show()
No description has been provided for this image

3. User Rating Distribution¶

In [13]:
# User-rating histogram (vestigial 5x3 grid / oversized canvas removed).
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_cleaned['User Rating'], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
ax.set_title('User Rating Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
ax.grid(True, alpha=0.3)
plt.show()
No description has been provided for this image

4. Top 10 Most Popular Games¶

In [14]:
# Top 10 most-reviewed titles (review count = rows per title).
# Single axes; plt.show() also suppresses the stray Text(...) repr the
# original cell emitted.
fig, ax = plt.subplots(figsize=(10, 6))
top_games = df_cleaned['Game Title'].value_counts().head(10)
ax.barh(range(len(top_games)), top_games.values)
ax.set_yticks(range(len(top_games)))
ax.set_yticklabels(top_games.index, fontsize=10)
ax.set_title('Top 10 Most Reviewed Games', fontsize=14, fontweight='bold')
ax.set_xlabel('Number of Reviews')
plt.show()
Out[14]:
Text(0.5, 0, 'Number of Reviews')
No description has been provided for this image

5. Distribution by Platform¶

In [15]:
# Share of reviews per platform. platform_counts is reused by a later
# insights cell, so the name must stay.
fig, ax = plt.subplots(figsize=(8, 8))
platform_counts = df_cleaned['Platform'].value_counts()
ax.pie(platform_counts.values, labels=platform_counts.index, autopct='%1.1f%%')
ax.set_title('Distribution by Platform', fontsize=14, fontweight='bold')
plt.show()
Out[15]:
Text(0.5, 1.0, 'Distribution by Platform')
No description has been provided for this image

6. Average Rating by Genre¶

In [16]:
# Mean rating per genre, best first. genre_rating is reused by a later
# insights cell, so the name must stay.
fig, ax = plt.subplots(figsize=(10, 6))
genre_rating = df_cleaned.groupby('Genre')['User Rating'].mean().sort_values(ascending=False)
ax.bar(range(len(genre_rating)), genre_rating.values, color='lightcoral')
ax.set_xticks(range(len(genre_rating)))
ax.set_xticklabels(genre_rating.index, rotation=45, ha='right')
ax.set_title('Average Rating by Genre', fontsize=14, fontweight='bold')
ax.set_ylabel('Average Rating')
plt.show()
Out[16]:
Text(0, 0.5, 'Average Rating')
No description has been provided for this image

7. Price Distribution¶

In [17]:
# Price histogram (vestigial 5x3 grid / oversized canvas removed).
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_cleaned['Price'], bins=30, alpha=0.7, color='lightgreen', edgecolor='black')
ax.set_title('Price Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Price ($)')
ax.set_ylabel('Frequency')
ax.grid(True, alpha=0.3)
plt.show()
No description has been provided for this image

8. Distribution by Age Group¶

In [19]:
# Review counts per target age group.
fig, ax = plt.subplots(figsize=(10, 6))
age_counts = df_cleaned['Age Group Targeted'].value_counts()
ax.bar(age_counts.index, age_counts.values, color='orange')
ax.set_title('Distribution by Age Group', fontsize=14, fontweight='bold')
ax.set_ylabel('Frequency')
ax.tick_params(axis='x', rotation=45)
plt.show()
Out[19]:
([0, 1, 2, 3],
 [Text(0, 0, 'Teens'),
  Text(1, 0, 'Kids'),
  Text(2, 0, 'All Ages'),
  Text(3, 0, 'Adults')])
No description has been provided for this image

9. Rating by Game Duration¶

In [21]:
# Rating vs. game duration scatter.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df_cleaned['Game Length (Hours)'], df_cleaned['User Rating'], alpha=0.5, color='brown')
ax.set_title('Rating vs. Game Duration', fontsize=14, fontweight='bold')
ax.set_xlabel('Duration (Hours)')
ax.set_ylabel('User Rating')
ax.grid(True, alpha=0.3)
plt.show()
No description has been provided for this image

10. Top Publishers by Average Rating¶

In [22]:
# Top publishers by average rating. Title said "Top 9" while head(10) plots
# ten — fixed to "Top 10".
fig, ax = plt.subplots(figsize=(10, 6))
pub_rating = df_cleaned.groupby('Publisher')['User Rating'].mean().sort_values(ascending=False).head(10)
ax.barh(range(len(pub_rating)), pub_rating.values)
ax.set_yticks(range(len(pub_rating)))
ax.set_yticklabels(pub_rating.index, fontsize=9)
ax.set_title('Top 10 Publishers by Rating', fontsize=14, fontweight='bold')
ax.set_xlabel('Average Rating')
plt.show()
Out[22]:
Text(0.5, 0, 'Average Rating')
No description has been provided for this image

11. Graphics Quality vs. Rating¶

In [23]:
# Mean rating per graphics-quality tier.
fig, ax = plt.subplots(figsize=(10, 6))
graphics_rating = df_cleaned.groupby('Graphics Quality')['User Rating'].mean()
ax.bar(graphics_rating.index, graphics_rating.values, color='teal')
ax.set_title('Rating by Graphics Quality', fontsize=14, fontweight='bold')
ax.set_ylabel('Average Rating')
plt.show()
Out[23]:
Text(0, 0.5, 'Average Rating')
No description has been provided for this image

12. Multiplayer vs. Single Player¶

In [24]:
# Multiplayer vs. single-player share.
fig, ax = plt.subplots(figsize=(8, 8))
mp_counts = df_cleaned['Multiplayer'].value_counts()
# Index values are already 'Yes'/'No'; the comprehension coerces any other
# value to 'No' (kept as-is to preserve that behavior).
ax.pie(mp_counts.values, labels=['Yes' if x=='Yes' else 'No' for x in mp_counts.index],
       autopct='%1.1f%%', colors=['lightblue', 'salmon'])
ax.set_title('Multiplayer Distribution', fontsize=14, fontweight='bold')
plt.show()
Out[24]:
Text(0.5, 1.0, 'Multiplayer Distribution')
No description has been provided for this image

13. Correlation Heatmap¶

In [25]:
# Pearson correlations between the numeric columns.
fig, ax = plt.subplots(figsize=(8, 6))
numeric_cols = ['User Rating', 'Price', 'Release Year', 'Game Length (Hours)']
correlation_matrix = df_cleaned[numeric_cols].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, square=True, fmt='.2f', ax=ax)
ax.set_title('Numerical Variable Correlations', fontsize=14, fontweight='bold')
plt.show()
Out[25]:
Text(0.5, 1.0, 'Numerical Variable Correlations')
No description has been provided for this image

14. Soundtrack Quality Distribution¶

In [26]:
# Review counts per soundtrack-quality tier.
fig, ax = plt.subplots(figsize=(10, 6))
soundtrack_counts = df_cleaned['Soundtrack Quality'].value_counts()
ax.bar(soundtrack_counts.index, soundtrack_counts.values, color='mediumpurple')
ax.set_title('Soundtrack Quality Distribution', fontsize=14, fontweight='bold')
ax.set_ylabel('Frequency')
plt.show()
Out[26]:
Text(0, 0.5, 'Frequency')
No description has been provided for this image

Análisis de Texto de Reseñas de Usuarios¶

In [27]:
# Concatenate every review into one corpus string for the word cloud below
# (astype(str) guards against NaN / non-string entries).
reviews_text = ' '.join(df_cleaned['User Review Text'].astype(str))
In [28]:
# Word cloud over the concatenated review corpus.
fig = plt.figure(figsize=(15, 8))
cloud = WordCloud(width=800, height=400, background_color='white').generate(reviews_text)
plt.imshow(cloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud – User Reviews', fontsize=16, fontweight='bold')
plt.show()
No description has been provided for this image
In [29]:
# Top-20 most frequent tokens across reviews (English stop words removed).
vectorizer = CountVectorizer(max_features=20, stop_words='english')
word_freq = vectorizer.fit_transform(df_cleaned['User Review Text'].astype(str))
feature_names = vectorizer.get_feature_names_out()
word_counts = word_freq.sum(axis=0).A1  # sparse column sums -> flat ndarray

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(feature_names, word_counts, color='lightcoral')
ax.set_title('Top 20 Most Frequent Words in Reviews', fontsize=14, fontweight='bold')
# Axis labels were Spanish under an English title — unified to English.
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()
No description has been provided for this image

INSIGHTS CLAVE¶

In [30]:
# Headline statistics for the cleaned dataset.
print(f"Rating promedio general: {df_cleaned['User Rating'].mean():.2f}")
print(f"Precio promedio: ${df_cleaned['Price'].mean():.2f}")
print(f"Duración promedio: {df_cleaned['Game Length (Hours)'].mean():.1f} horas")

# genre_rating / platform_counts come from the plotting cells above.
print(f"\nGénero con mejor rating: {genre_rating.index[0]} ({genre_rating.iloc[0]:.2f})")
print(f"Plataforma más popular: {platform_counts.index[0]} ({platform_counts.iloc[0]} juegos)")

# Compute the per-publisher mean once (the original ran the identical
# groupby + sort twice, once for the value and once for the name).
publisher_ratings = df_cleaned.groupby('Publisher')['User Rating'].mean().sort_values(ascending=False)
best_pub_name = publisher_ratings.index[0]
best_publisher = publisher_ratings.iloc[0]
print(f"Publisher con mejor rating promedio: {best_pub_name} ({best_publisher:.2f})")
Rating promedio general: 29.72
Precio promedio: $39.95
Duración promedio: 32.5 horas

Género con mejor rating: Party (29.79)
Plataforma más popular: PlayStation (9633 juegos)
Publisher con mejor rating promedio: Innersloth (29.93)
In [31]:
# Average price per graphics-quality tier.
# One groupby instead of a filtered full scan per tier; iteration still follows
# unique() order, so the printed output is byte-identical to the original.
print("\nPrecio promedio por calidad gráfica:")
mean_price_by_quality = df_cleaned.groupby('Graphics Quality')['Price'].mean()
for quality in df_cleaned['Graphics Quality'].unique():
    print(f"  {quality}: ${mean_price_by_quality[quality]:.2f}")

print(f"\nPorcentaje de juegos multiplayer: {(df_cleaned['Multiplayer']=='Yes').mean()*100:.1f}%")
Precio promedio por calidad gráfica:
  Medium: $39.91
  Low: $39.80
  High: $40.10
  Ultra: $40.00

Porcentaje de juegos multiplayer: 49.6%

Conversion to Parquet Format¶

In [32]:
print("\n=== CONVERSIÓN A FORMATO PARQUET ===")

# 1. TARGET VARIABLES
S3_BUCKET = "xideralaws-curso-uriel"  # destination bucket
S3_PREFIX = "parquet"                 # destination "folder" (key prefix)
s3_client = s3  # reuse the boto3 client created earlier

print(f"🎯 Bucket destino: {S3_BUCKET}")
print(f"📁 Carpeta destino: {S3_PREFIX}")

# 2. PREPARE THE FRAME TO EXPORT
# NOTE(review): despite the name, this frame is exported with pandas/pyarrow
# below, not Spark.
print("🔄 Preparando datos para export...")
df_for_spark = df_cleaned.copy()

# 3. VERIFY AWS CREDENTIALS
# NOTE(review): on failure this only prints a message — execution still
# continues into the export step.
try:
    test_response = s3_client.list_objects_v2(Bucket=S3_BUCKET, MaxKeys=1)
    print("✅ Credenciales AWS verificadas - Acceso a S3 confirmado")
except Exception as e:
    print(f"❌ Error de credenciales AWS: {e}")
    print("💡 Verifica tus credenciales AWS")

# 4. SIMPLE, RELIABLE EXPORT PATH — PANDAS + BOTO3
try:
    from io import BytesIO  # local import: only needed for the in-memory buffer

    print("📤 Exportando con pandas + boto3...")

    # In-memory buffer so no temporary file touches disk.
    parquet_buffer = BytesIO()

    # Serialize to Parquet (snappy compression, pandas index omitted).
    df_for_spark.to_parquet(
        parquet_buffer,
        engine='pyarrow',
        compression='snappy',
        index=False
    )
    parquet_buffer.seek(0)  # rewind before uploading

    # S3 object key for the export.
    s3_key = f"{S3_PREFIX}/video_game_reviews_cleaned.parquet"

    # Upload to S3.
    print(f"⬆️ Subiendo a s3://{S3_BUCKET}/{s3_key}")
    s3_client.upload_fileobj(parquet_buffer, S3_BUCKET, s3_key)

    # Confirm the object exists and report its size.
    response = s3_client.head_object(Bucket=S3_BUCKET, Key=s3_key)
    size_mb = response['ContentLength'] / (1024*1024)

    print(f"🎉 ¡ÉXITO! Dataset exportado a S3")
    print(f"📁 Ubicación: s3://{S3_BUCKET}/{s3_key}")
    print(f"📦 Tamaño: {size_mb:.2f} MB")
    print(f"📅 Última modificación: {response['LastModified']}")
    print(f"🔢 Filas exportadas: {len(df_for_spark)}")
    print(f"🔢 Columnas exportadas: {len(df_for_spark.columns)}")

except Exception as e:
    print(f"❌ Error durante export: {e}")
    print("💡 Revisa las credenciales AWS y permisos del bucket")

# 5. VERIFY BY LISTING OBJECTS UNDER THE PREFIX
print("\n=== VERIFICACIÓN FINAL ===")
try:
    print("🔍 Verificando archivos en S3...")
    response = s3_client.list_objects_v2(
        Bucket=S3_BUCKET,
        Prefix=S3_PREFIX
    )

    # 'Contents' is absent when no object matches the prefix.
    if 'Contents' in response:
        print(f"✅ Archivos encontrados en s3://{S3_BUCKET}/{S3_PREFIX}/:")
        for obj in response['Contents']:
            size_mb = obj['Size'] / (1024*1024)
            print(f"  📄 {obj['Key']} ({size_mb:.2f} MB) - {obj['LastModified']}")
    else:
        print(f"⚠️ No se encontraron archivos en s3://{S3_BUCKET}/{S3_PREFIX}/")

except Exception as e:
    print(f"❌ Error listando objetos S3: {e}")

# 6. READ-BACK TEST FROM S3
print("\n=== PRUEBA DE LECTURA ===")
try:
    s3_key_test = f"{S3_PREFIX}/video_game_reviews_cleaned.parquet"
    print(f"📖 Intentando leer desde s3://{S3_BUCKET}/{s3_key_test}")

    # Download the object into memory (BytesIO imported in step 4 above).
    obj = s3_client.get_object(Bucket=S3_BUCKET, Key=s3_key_test)
    parquet_data = BytesIO(obj['Body'].read())

    # Parse with pandas to confirm the round trip.
    df_test = pd.read_parquet(parquet_data)

    print(f"✅ ¡Lectura exitosa desde S3!")
    print(f"🔢 Filas leídas: {len(df_test)}")
    print(f"🔢 Columnas leídas: {len(df_test.columns)}")
    print("📊 Primeras columnas:", list(df_test.columns[:5]))

except Exception as e:
    print(f"❌ Error leyendo desde S3: {e}")

print("\n🏁 Proceso de conversión a Parquet completado")
=== CONVERSIÓN A FORMATO PARQUET ===
🎯 Bucket destino: xideralaws-curso-uriel
📁 Carpeta destino: parquet
🔄 Preparando datos para export...
✅ Credenciales AWS verificadas - Acceso a S3 confirmado
📤 Exportando con pandas + boto3...
⬆️ Subiendo a s3://xideralaws-curso-uriel/parquet/video_game_reviews_cleaned.parquet
🎉 ¡ÉXITO! Dataset exportado a S3
📁 Ubicación: s3://xideralaws-curso-uriel/parquet/video_game_reviews_cleaned.parquet
📦 Tamaño: 0.43 MB
📅 Última modificación: 2025-08-29 10:12:31+00:00
🔢 Filas exportadas: 47774
🔢 Columnas exportadas: 16

=== VERIFICACIÓN FINAL ===
🔍 Verificando archivos en S3...
✅ Archivos encontrados en s3://xideralaws-curso-uriel/parquet/:
  📄 parquet/video_game_reviews_cleaned.parquet (0.43 MB) - 2025-08-29 10:12:31+00:00

=== PRUEBA DE LECTURA ===
📖 Intentando leer desde s3://xideralaws-curso-uriel/parquet/video_game_reviews_cleaned.parquet
✅ ¡Lectura exitosa desde S3!
🔢 Filas leídas: 47774
🔢 Columnas leídas: 16
📊 Primeras columnas: ['Game Title', 'User Rating', 'Age Group Targeted', 'Price', 'Platform']

🏁 Proceso de conversión a Parquet completado

SUBIR RESULTADO A S3 (OPCIONAL)¶

In [33]:
# Configuración para subir el archivo Parquet a S3
output_bucket = bucket_name
output_key = "processed_data/video_game_reviews_cleaned.parquet"

print(f"Archivo Parquet listo para subir a:")
print(f"Bucket: {output_bucket}")
print(f"Key: {output_key}")
Archivo Parquet listo para subir a:
Bucket: xideralaws-curso-uriel
Key: processed_data/video_game_reviews_cleaned.parquet

Resource Cleanup¶

In [34]:
# Final summary banner and resource cleanup.
print("\n=== FINALIZANDO ANÁLISIS ===")
print("✅ Análisis completado exitosamente")
print("✅ Visualizaciones generadas")
print("✅ Análisis de texto realizado")
print("✅ Estadísticas calculadas")
print("✅ Datos convertidos a formato Parquet")
print("✅ Preparado para ejecución en AWS EMR con Spark Jobs")

# Stop the Spark session to release JVM resources; any Spark call after this
# point would fail.
spark.stop()

print("\n🎯 RESUMEN FINAL:")
print(f"- Dataset original: {df.shape[0]} filas, {df.shape[1]} columnas")
print(f"- Dataset limpio: {df_cleaned.shape[0]} filas, {df_cleaned.shape[1]} columnas")
print(f"- 15 visualizaciones generadas")
print(f"- Análisis de texto completado")
print(f"- Correlaciones calculadas")
print(f"- Archivo Parquet creado y verificado")
print(f"- Listo para producción en AWS EMR")
=== FINALIZANDO ANÁLISIS ===
✅ Análisis completado exitosamente
✅ Visualizaciones generadas
✅ Análisis de texto realizado
✅ Estadísticas calculadas
✅ Datos convertidos a formato Parquet
✅ Preparado para ejecución en AWS EMR con Spark Jobs

🎯 RESUMEN FINAL:
- Dataset original: 47774 filas, 18 columnas
- Dataset limpio: 47774 filas, 16 columnas
- 15 visualizaciones generadas
- Análisis de texto completado
- Correlaciones calculadas
- Archivo Parquet creado y verificado
- Listo para producción en AWS EMR

Evidencia de archivo .parquet guardado en Bucket¶

image.png

image.png

Visualización en Streamlit¶

Code¶

In [ ]:
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# ==============================
# Initial page configuration
# ==============================
# Must be the first Streamlit call in the script, before any other st.* usage.
st.set_page_config(
    page_title="Video Game Reviews Dashboard",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for dark theme styling.
# NOTE(review): `.css-1d391kg` is an auto-generated Streamlit class name and
# may break on Streamlit upgrades — confirm against the deployed version.
st.markdown("""
<style>
    .main {
        background-color: #0E1117;
    }

    .stApp {
        background-color: #0E1117;
    }

    .css-1d391kg {
        background-color: #262730;
    }

    .metric-container {
        background-color: #262730;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 0.5rem 0;
    }

    .stSelectbox > div > div {
        background-color: #262730;
    }

    .uploadedFile {
        background-color: #262730;
    }
</style>
""", unsafe_allow_html=True)

# Title with gaming icon
st.title("🎮 Dashboard de Análisis de Videojuegos")

# Sidebar: CSV file upload controls
st.sidebar.title("Datos")
st.sidebar.markdown("Sube tu archivo CSV de videojuegos")
st.sidebar.markdown("Drag and drop file here")
st.sidebar.markdown("Limit 200MB per file • CSV")
file = st.sidebar.file_uploader("", type=["csv"], label_visibility="collapsed")

# Halt the script run until a file has been provided.
if file is None:
    st.warning("⚠️ Sube un archivo CSV para continuar.")
    st.stop()

# Load the uploaded CSV into a DataFrame.
# Assumes columns like 'User Rating', 'Price', 'Platform' exist — the KPI
# section below will raise KeyError otherwise; no schema validation is done.
df = pd.read_csv(file)

# ==============================
# Headline KPI metrics
# ==============================
# Compute each summary figure once, then render one metric per column.
kpi_values = {
    "Total Juegos": f"{df.shape[0]:,}",
    "Rating Promedio": f"{df['User Rating'].mean():.1f}/100",
    "Precio Promedio": f"${df['Price'].mean():.2f}",
    "Plataformas": df['Platform'].nunique(),
    "Duración Promedio": f"{df['Game Length (Hours)'].mean():.1f} hrs",
}

for column, (label, value) in zip(st.columns(len(kpi_values)), kpi_values.items()):
    with column:
        st.metric(label, value)

st.markdown("---")

# ==============================
# Visualizaciones
# ==============================

# Use matplotlib's dark style so figures match the app background.
plt.style.use('dark_background')

# Row 1: two side-by-side panels
row1_left, row1_right = st.columns(2)

# --- Histogram of user ratings ---
with row1_left:
    st.subheader("Distribución de Ratings")
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(df['User Rating'], bins=20, alpha=0.7,
            color='#1f77b4', edgecolor='white')
    ax.set(xlabel="Rating del Usuario", ylabel="Cantidad de Juegos")
    ax.set_facecolor('#0E1117')
    fig.patch.set_facecolor('#0E1117')
    st.pyplot(fig)

# --- Bar chart of game counts per platform ---
with row1_right:
    st.subheader("Juegos por Plataforma")
    counts_by_platform = df['Platform'].value_counts()
    palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
    fig, ax = plt.subplots(figsize=(8, 5))
    counts_by_platform.plot(kind='bar', ax=ax,
                            color=palette[:len(counts_by_platform)])
    ax.set(xlabel="Plataforma", ylabel="Cantidad de Juegos")
    ax.tick_params(axis='x', rotation=45)
    ax.set_facecolor('#0E1117')
    fig.patch.set_facecolor('#0E1117')
    st.pyplot(fig)

st.markdown("---")

# Row 2: two side-by-side panels
row2_left, row2_right = st.columns(2)

# --- Pie chart of the eight most common genres ---
with row2_left:
    st.subheader("Distribución por Género")
    top_genres = df['Genre'].value_counts().head(8)
    fig, ax = plt.subplots(figsize=(8, 6))
    slice_colors = plt.cm.Set3(np.linspace(0, 1, len(top_genres)))
    ax.pie(top_genres.values, labels=top_genres.index,
           autopct='%1.1f%%', colors=slice_colors)
    ax.set_facecolor('#0E1117')
    fig.patch.set_facecolor('#0E1117')
    st.pyplot(fig)

# --- Scatter of price vs. rating, color-mapped by rating ---
with row2_right:
    st.subheader("Precio vs Rating")
    fig, ax = plt.subplots(figsize=(8, 6))
    points = ax.scatter(df['Price'], df['User Rating'],
                        alpha=0.6, c=df['User Rating'],
                        cmap='viridis', s=50)
    ax.set(xlabel="Precio ($)", ylabel="Rating del Usuario")
    plt.colorbar(points, ax=ax, label='Rating')
    ax.set_facecolor('#0E1117')
    fig.patch.set_facecolor('#0E1117')
    st.pyplot(fig)

st.markdown("---")

# Row 3: full-width bar chart of average game length per genre
st.subheader("Duración de Juegos por Género")
avg_length_by_genre = (
    df.groupby('Genre')['Game Length (Hours)']
    .mean()
    .sort_values(ascending=False)
)
fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(avg_length_by_genre.index, avg_length_by_genre.values,
              color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f'])
ax.set(xlabel="Género", ylabel="Duración Promedio (Horas)")
ax.tick_params(axis='x', rotation=45)
ax.set_facecolor('#0E1117')
fig.patch.set_facecolor('#0E1117')

# Annotate each bar with its value just above the top edge.
for rect in bars:
    value = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2., value + 1,
            f'{value:.1f}', ha='center', va='bottom')

st.pyplot(fig)

st.markdown("---")

# Summary table: mean rating / count / price / length per targeted age group
st.subheader("Estadísticas por Grupo de Edad")
age_group_stats = (
    df.groupby('Age Group Targeted')
    .agg({
        'User Rating': ['mean', 'count'],
        'Price': 'mean',
        'Game Length (Hours)': 'mean'
    })
    .round(2)
)

# Flatten the MultiIndex produced by the multi-function agg into plain labels.
age_group_stats.columns = ['Rating Promedio', 'Cantidad',
                           'Precio Promedio', 'Duración Promedio']
st.dataframe(age_group_stats, use_container_width=True)

st.markdown("---")
st.markdown("🎮 **Dashboard de Videojuegos** - Análisis completo de reviews y características")

Evidencia del código en Visual¶

image.png

Visualización 1¶

image.png

Visualización 2¶

image.png

Visualización 3¶

image.png

Lambda¶

In [ ]:
import json
import boto3
import pandas as pd
from io import StringIO, BytesIO

def lambda_handler(event, context):
    """AWS Lambda entry point: clean a CSV in S3 and write it back as Parquet.

    Reads ``video_game_reviews.csv`` from the course bucket, drops exact
    duplicate rows, reports (but does not fill) null counts, serializes the
    result to snappy-compressed Parquet in memory, uploads it under
    ``processed_data/``, and verifies the upload via ``head_object``.

    Parameters:
        event: Lambda invocation payload (unused).
        context: Lambda runtime context object (unused).

    Returns:
        dict: API-Gateway-style response — ``statusCode`` 200 with processing
        stats in a JSON ``body`` on success, 500 with the error message on
        any failure.
    """

    # S3 configuration: fixed source and destination keys in the course bucket
    s3 = boto3.client("s3")
    bucket_name = "xideralaws-curso-uriel"
    input_key = "video_game_reviews.csv"
    output_key = "processed_data/video_game_reviews_cleaned.parquet"

    try:
        # 1. Load the raw CSV from S3 into a pandas DataFrame
        print("=== CARGANDO DATOS DESDE AWS S3 ===")
        response = s3.get_object(Bucket=bucket_name, Key=input_key)
        csv_content = response["Body"].read().decode("utf-8")
        df = pd.read_csv(StringIO(csv_content))
        print(f"✅ Datos cargados exitosamente desde S3")
        print(f"Filas: {df.shape[0]}, Columnas: {df.shape[1]}")

        # 2. Basic data cleaning
        print("=== PROCESANDO DATOS ===")

        # Work on a copy so the raw frame remains untouched
        df_cleaned = df.copy()

        # Drop exact duplicate rows, tracking how many were removed
        initial_rows = df_cleaned.shape[0]
        df_cleaned = df_cleaned.drop_duplicates()
        print(f"Duplicados eliminados: {initial_rows - df_cleaned.shape[0]}")

        # Report null counts (informational only — no imputation or dropping
        # of null rows is performed here)
        null_counts = df_cleaned.isnull().sum()
        print(f"Valores nulos por columna:\n{null_counts}")

        # 3. Serialize to Parquet in memory and upload to S3
        print("=== EXPORTANDO A PARQUET ===")
        parquet_buffer = BytesIO()
        df_cleaned.to_parquet(
            parquet_buffer,
            engine='pyarrow',
            compression='snappy',
            index=False
        )
        # Rewind so upload_fileobj reads from the start of the buffer
        parquet_buffer.seek(0)

        # Upload the in-memory Parquet to the destination key
        s3.upload_fileobj(parquet_buffer, bucket_name, output_key)

        # Verify the upload and capture the object's size
        response = s3.head_object(Bucket=bucket_name, Key=output_key)
        size_mb = response['ContentLength'] / (1024*1024)

        # 4. Build the success response payload
        result = {
            'statusCode': 200,
            'body': json.dumps({
                'message': 'Procesamiento completado exitosamente',
                'input_file': f"s3://{bucket_name}/{input_key}",
                'output_file': f"s3://{bucket_name}/{output_key}",
                'rows_processed': df_cleaned.shape[0],
                'columns': df_cleaned.shape[1],
                'output_size_mb': round(size_mb, 2),
                'duplicates_removed': initial_rows - df_cleaned.shape[0]
            })
        }

        print("🎉 ¡Procesamiento completado!")
        return result

    except Exception as e:
        # Broad catch: any failure (S3 access, parsing, serialization) is
        # reported back as a 500 with the raw error message
        print(f"❌ Error: {str(e)}")
        return {
            'statusCode': 500,
            'body': json.dumps({
                'error': 'Error procesando datos',
                'message': str(e)
            })
        }

Evidencia de Lambda en Consola AWS¶

image.png

Architecture Diagram¶

Diagrama simple.png

Evidencia de Spark¶

Jobs¶

jobs1.png

Environment¶

jobs2.png

Executors¶

jobs3.png

Certificaciones¶

Data Engineering on AWS-Foundations¶

image.png

Getting into the Serverless Mindset¶

image.png

AWS Well-Architected Foundations¶

image.png

AWS Cloud Practitioner Essentials¶

image.png

In [ ]: